# -*- coding: utf-8 -*-
pip install plotly
Requirement already satisfied: plotly in /Users/duanxiaoran/anaconda3/lib/python3.11/site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in /Users/duanxiaoran/anaconda3/lib/python3.11/site-packages (from plotly) (8.2.2) Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the YouTube statistics CSV from the local Downloads folder. The file is
# decoded as ISO-8859-1 instead of pandas' default UTF-8.
csv_path = '/Users/duanxiaoran/Downloads/Global YouTube Statistics.csv'
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Print the first rows as a quick check that the columns parsed as expected.
first_rows = df.head()
print(first_rows)
rank Youtuber subscribers video views \
0 1 T-Series 245000000 2.280000e+11
1 2 YouTube Movies 170000000 0.000000e+00
2 3 MrBeast 166000000 2.836884e+10
3 4 Cocomelon - Nursery Rhymes 162000000 1.640000e+11
4 5 SET India 159000000 1.480000e+11
category Title uploads Country \
0 Music T-Series 20082 India
1 Film & Animation youtubemovies 1 United States
2 Entertainment MrBeast 741 United States
3 Education Cocomelon - Nursery Rhymes 966 United States
4 Shows SET India 116536 India
Abbreviation channel_type ... subscribers_for_last_30_days \
0 IN Music ... 2000000.0
1 US Games ... NaN
2 US Entertainment ... 8000000.0
3 US Education ... 1000000.0
4 IN Entertainment ... 1000000.0
created_year created_month created_date \
0 2006.0 Mar 13.0
1 2006.0 Mar 5.0
2 2012.0 Feb 20.0
3 2006.0 Sep 1.0
4 2006.0 Sep 20.0
Gross tertiary education enrollment (%) Population Unemployment rate \
0 28.1 1.366418e+09 5.36
1 88.2 3.282395e+08 14.70
2 88.2 3.282395e+08 14.70
3 88.2 3.282395e+08 14.70
4 28.1 1.366418e+09 5.36
Urban_population Latitude Longitude
0 471031528.0 20.593684 78.962880
1 270663028.0 37.090240 -95.712891
2 270663028.0 37.090240 -95.712891
3 270663028.0 37.090240 -95.712891
4 471031528.0 20.593684 78.962880
[5 rows x 28 columns]
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 995 entries, 0 to 994 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rank 995 non-null int64 1 Youtuber 995 non-null object 2 subscribers 995 non-null int64 3 video views 995 non-null float64 4 category 949 non-null object 5 Title 995 non-null object 6 uploads 995 non-null int64 7 Country 873 non-null object 8 Abbreviation 873 non-null object 9 channel_type 965 non-null object 10 video_views_rank 994 non-null float64 11 country_rank 879 non-null float64 12 channel_type_rank 962 non-null float64 13 video_views_for_the_last_30_days 939 non-null float64 14 lowest_monthly_earnings 995 non-null float64 15 highest_monthly_earnings 995 non-null float64 16 lowest_yearly_earnings 995 non-null float64 17 highest_yearly_earnings 995 non-null float64 18 subscribers_for_last_30_days 658 non-null float64 19 created_year 990 non-null float64 20 created_month 990 non-null object 21 created_date 990 non-null float64 22 Gross tertiary education enrollment (%) 872 non-null float64 23 Population 872 non-null float64 24 Unemployment rate 872 non-null float64 25 Urban_population 872 non-null float64 26 Latitude 872 non-null float64 27 Longitude 872 non-null float64 dtypes: float64(18), int64(3), object(7) memory usage: 217.8+ KB
# Build a boolean mask of fully duplicated rows; keep=False marks every member
# of a duplicate group as True. The result is not assigned, so it is only
# visible when this runs as the last expression of an interactive cell.
df.duplicated(keep=False)
0 False
1 False
2 False
3 False
4 False
...
990 False
991 False
992 False
993 False
994 False
Length: 995, dtype: bool
# Remove exact duplicate rows, modifying df in place (first occurrence is kept).
df.drop_duplicates(inplace=True)
# Tally how many channels each country has, as a two-column table
# ('Country', 'Count') ready for plotting. Rows without a country are not
# counted by value_counts().
country_counts = (
    df['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
)

# Pie chart: one slice per country, sized by its channel count.
fig = px.pie(
    country_counts,
    names='Country',
    values='Count',
    title='Percentage of YouTubers in Different Countries',
)
fig.show()
def _category_bar(country, title):
    """Build the per-category channel-count bar chart for one country.

    Args:
        country: Value of the ``Country`` column used to filter ``df``.
        title: Chart title passed through to Plotly Express.

    Returns:
        A ``(channels, counts, figure)`` tuple: the rows of ``df`` for that
        country, the ``'Video Type'`` / ``'Count'`` table derived from their
        ``category`` column, and the bar figure drawn from that table.
    """
    channels = df[df['Country'] == country]
    counts = channels['category'].value_counts().reset_index()
    counts.columns = ['Video Type', 'Count']
    figure = px.bar(counts, x='Video Type', y='Count', title=title)
    return channels, counts, figure


# One bar chart per country. Bars show the number of channels in each
# category. The module-level names of the original per-country cells are kept
# so any later code that refers to them keeps working.
us_youtubers, video_type_counts, fig = _category_bar(
    'United States', 'U.S. YouTubers share of different types of videos')
fig.show()

uk_youtubers, video_type_counts_uk, fig_uk = _category_bar(
    'United Kingdom', 'UK YouTubers share of different types of videos')
fig_uk.show()

australia_youtubers, video_type_counts_australia, fig_australia = _category_bar(
    'Australia', 'AU YouTubers share of different types of videos')
fig_australia.show()

canada_youtubers, video_type_counts_canada, fig_canada = _category_bar(
    'Canada', 'Canada YouTubers share of different types of videos')
fig_canada.show()
# Restrict the data to four countries, then total the 'lowest_yearly_earnings'
# column per category. The chart's "Total Income" is that sum.
selected_countries = ['United States', 'United Kingdom', 'Australia', 'Canada']
in_selected = df['Country'].isin(selected_countries)
selected_youtubers = df[in_selected]

earnings_by_category = selected_youtubers.groupby('category')['lowest_yearly_earnings']
video_type_income = earnings_by_category.sum().reset_index()
video_type_income.columns = ['Video Type', 'Total Income']

fig_income = px.bar(
    video_type_income,
    x='Video Type',
    y='Total Income',
    title='YouTubers total revenue from different video types in different countries',
)
fig_income.show()
# People & Blogs: subscribers vs. lowest yearly earnings (plain scatter, no fit).
people_blog_data = df[df['category'] == 'People & Blogs']
axis_labels = {'subscribers': 'subs', 'lowest_yearly_earnings': 'lowest earnings'}
fig = px.scatter(
    people_blog_data,
    x='subscribers',
    y='lowest_yearly_earnings',
    title='People & Blogs video type revenue and number of subs',
    labels=axis_labels,
)
fig.show()
# Same People & Blogs scatter as before, now with an ordinary-least-squares
# trendline drawn by Plotly Express.
people_blog_data = df[df['category'] == 'People & Blogs']
scatter_options = dict(
    x='subscribers',
    y='lowest_yearly_earnings',
    title='People & Blogs video type revenue and number of subs',
    labels={'subscribers': 'subs', 'lowest_yearly_earnings': 'lowest earnings'},
    trendline='ols',
)
fig = px.scatter(people_blog_data, **scatter_options)
fig.show()
def _earnings_scatter(channels, title):
    """Return a subscribers-vs-lowest-yearly-earnings scatter with an OLS trendline.

    Args:
        channels: Rows of ``df`` to plot.
        title: Chart title.
    """
    return px.scatter(
        channels,
        x='subscribers',
        y='lowest_yearly_earnings',
        title=title,
        labels={'subscribers': 'subs', 'lowest_yearly_earnings': 'earnings'},
        trendline='ols',
    )


# Gaming
gaming_data = df[df['category'] == 'Gaming']
fig_gaming = _earnings_scatter(gaming_data, 'Gaming')
fig_gaming.show()

# Music
music_data = df[df['category'] == 'Music']
fig_music = _earnings_scatter(music_data, 'Music')
fig_music.show()

# Entertainment
entertainment_data = df[df['category'] == 'Entertainment']
fig_entertainment = _earnings_scatter(entertainment_data, 'Entertainment')
fig_entertainment.show()
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# 2x2 grid with one panel per category, each plotting subscribers against
# lowest yearly earnings as markers.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=['Gaming', 'Music', 'Entertainment', 'People & Blogs'],
)

gaming_data = df[df['category'] == 'Gaming']
music_data = df[df['category'] == 'Music']
entertainment_data = df[df['category'] == 'Entertainment']
people_blog_data = df[df['category'] == 'People & Blogs']

# (rows to plot, trace name, grid row, grid column), in subplot-title order.
panels = [
    (gaming_data, 'Gaming', 1, 1),
    (music_data, 'Music', 1, 2),
    (entertainment_data, 'Entertainment', 2, 1),
    (people_blog_data, 'People & Blogs', 2, 2),
]

for panel_rows, panel_name, grid_row, grid_col in panels:
    fig.add_trace(
        go.Scatter(
            x=panel_rows['subscribers'],
            y=panel_rows['lowest_yearly_earnings'],
            mode='markers',
            name=panel_name,
        ),
        row=grid_row,
        col=grid_col,
    )

fig.update_layout(title_text='Revenue from different video types', showlegend=True)

# Label the axes of every panel individually.
for _, _, grid_row, grid_col in panels:
    fig.update_xaxes(title_text='subs', row=grid_row, col=grid_col)
for _, _, grid_row, grid_col in panels:
    fig.update_yaxes(title_text='earnings', row=grid_row, col=grid_col)

fig.show()
# Single-panel figure overlaying the four categories as marker traces.
fig = make_subplots(rows=1, cols=1, subplot_titles=['Revenue from different video types'])

gaming_data = df[df['category'] == 'Gaming']
music_data = df[df['category'] == 'Music']
entertainment_data = df[df['category'] == 'Entertainment']
people_blog_data = df[df['category'] == 'People & Blogs']

# Each category is its own legend group, keyed by the category name.
for _label, _subset in (
    ('Gaming', gaming_data),
    ('Music', music_data),
    ('Entertainment', entertainment_data),
    ('People & Blogs', people_blog_data),
):
    fig.add_trace(
        go.Scatter(
            x=_subset['subscribers'],
            y=_subset['lowest_yearly_earnings'],
            mode='markers',
            name=_label,
            legendgroup=_label,
        )
    )
def add_trendline(data, name):
    """Overlay a least-squares regression line for one category on ``fig``.

    Fits ``lowest_yearly_earnings`` against ``subscribers`` with
    ``scipy.stats.linregress`` and adds the fitted line, sampled at 100 evenly
    spaced subscriber counts between the observed minimum and maximum, as a
    new trace on the module-level figure ``fig``.

    Args:
        data: DataFrame with ``'subscribers'`` and ``'lowest_yearly_earnings'``
            columns.
        name: Category label. It is used in the trace name
            (``'<name> Trendline'``) and as the trace's legend group.
    """
    # Imported here because nothing earlier in the script brings linregress
    # into scope; without it the first call fails with NameError.
    from scipy.stats import linregress

    # Only slope and intercept are needed to draw the line.
    slope, intercept, *_ = linregress(data['subscribers'], data['lowest_yearly_earnings'])
    x_fit = np.linspace(data['subscribers'].min(), data['subscribers'].max(), 100)
    y_fit = intercept + slope * x_fit
    fig.add_trace(go.Scatter(x=x_fit, y=y_fit, mode='lines', name=f'{name} Trendline', legendgroup=name))
# Add one fitted line per category, in the same order as the marker traces.
for _subset, _label in (
    (gaming_data, 'Gaming'),
    (music_data, 'Music'),
    (entertainment_data, 'Entertainment'),
    (people_blog_data, 'People & Blogs'),
):
    add_trendline(_subset, _label)

# Axis titles for the single panel, then render.
fig.update_xaxes(title_text='subs')
fig.update_yaxes(title_text='earnings')
fig.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[26], line 23 20 y_fit = intercept + slope * x_fit 21 fig.add_trace(go.Scatter(x=x_fit, y=y_fit, mode='lines', name=f'{name} Trendline', legendgroup=name)) ---> 23 add_trendline(gaming_data, 'Gaming') 24 add_trendline(music_data, 'Music') 25 add_trendline(entertainment_data, 'Entertainment') Cell In[26], line 18, in add_trendline(data, name) 17 def add_trendline(data, name): ---> 18 slope, intercept, r_value, p_value, std_err = linregress(data['subscribers'], data['lowest_yearly_earnings']) 19 x_fit = np.linspace(data['subscribers'].min(), data['subscribers'].max(), 100) 20 y_fit = intercept + slope * x_fit NameError: name 'linregress' is not defined
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from scipy.stats import linregress
# Rebuild the single-panel figure: one marker trace per category.
fig = make_subplots(rows=1, cols=1, subplot_titles=['Revenue from different video types'])

gaming_data = df[df['category'] == 'Gaming']
music_data = df[df['category'] == 'Music']
entertainment_data = df[df['category'] == 'Entertainment']
people_blog_data = df[df['category'] == 'People & Blogs']

# Category name -> rows to plot; insertion order fixes the trace order.
category_frames = {
    'Gaming': gaming_data,
    'Music': music_data,
    'Entertainment': entertainment_data,
    'People & Blogs': people_blog_data,
}
for category_name, category_rows in category_frames.items():
    markers = go.Scatter(
        x=category_rows['subscribers'],
        y=category_rows['lowest_yearly_earnings'],
        mode='markers',
        name=category_name,
        legendgroup=category_name,
    )
    fig.add_trace(markers)
def add_trendline(data, name):
    """Draw the linear-regression line for one category onto ``fig``.

    Regresses ``lowest_yearly_earnings`` on ``subscribers`` and adds the
    fitted line, evaluated at 100 evenly spaced points across the observed
    subscriber range, as a ``'<name> Trendline'`` trace in legend group
    ``name`` on the module-level figure ``fig``.

    Args:
        data: DataFrame with ``'subscribers'`` and ``'lowest_yearly_earnings'``
            columns.
        name: Category label used for the trace name and legend group.
    """
    subs = data['subscribers']
    fit = linregress(subs, data['lowest_yearly_earnings'])
    x_fit = np.linspace(subs.min(), subs.max(), 100)
    y_fit = fit.intercept + fit.slope * x_fit
    line = go.Scatter(
        x=x_fit,
        y=y_fit,
        mode='lines',
        name=f'{name} Trendline',
        legendgroup=name,
    )
    fig.add_trace(line)
# Fit and draw a regression line for each category, matching the marker order.
trendline_inputs = [
    (gaming_data, 'Gaming'),
    (music_data, 'Music'),
    (entertainment_data, 'Entertainment'),
    (people_blog_data, 'People & Blogs'),
]
for category_rows, category_name in trendline_inputs:
    add_trendline(category_rows, category_name)

# Label both axes of the single panel and display the figure.
fig.update_xaxes(title_text='subs')
fig.update_yaxes(title_text='earnings')
fig.show()